package com.aroura.mycrawler;

import com.aroura.util.DateStringFormatter;
import lombok.Data;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.Formatter;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Annotation-driven WebMagic page model for a single article page.
 *
 * <p>Pages whose URL matches the {@code @TargetUrl} pattern are extracted into
 * instances of this class; pages matching the {@code @HelpUrl} pattern are the
 * listing pages configured alongside it. Each annotated field is filled from
 * the XPath expression given in its {@code @ExtractBy} annotation, and
 * {@link #afterProcess(Page)} then publishes cleaned-up counter values on the
 * page.
 */
@TargetUrl("https://ask.hellobi.com/article/\\d+")
@HelpUrl("https://blog.hellobi.com/hot/weekly\\?page=\\d+")
@Data
public class BlogInfo implements AfterExtractor {

    /** Identifier; not populated by any extractor in this class. */
    private Integer id;

    /** {@code href} of the link inside the {@code h1.clearfix} heading. */
    @ExtractBy("//h1[@class='clearfix']/a/@href")
    private String url;

    /** Text of the link inside the {@code h1.clearfix} heading. */
    @ExtractBy("//h1[@class='clearfix']/a/text()")
    private String title;

    /** Text of the {@code aw-user-name} link in the sidebar. */
    @ExtractBy("//section[@class='sidebar']/div/div/a[@class='aw-user-name']/text()")
    private String author;

    /**
     * Raw text of the matched {@code span}; {@link #afterProcess(Page)} treats it
     * as colon-separated and keeps the part after the first colon.
     */
    @ExtractBy("//div[@class='row']/div/div/div/div/span/text()")
    private String readNum;

    /** Text of the {@code b} element inside the {@code a.agree} link. */
    @ExtractBy("//a[@class='agree']/b/text()")
    private String recommendNum;

    /** {@code href} of the {@code aw-user-name} link in the sidebar. */
    @ExtractBy("//section[@class='sidebar']/div/div/a[@class='aw-user-name']/@href")
    private String blogHomeUrl;

    /**
     * Raw text of the {@code h2} inside {@code div.aw-mod};
     * {@link #afterProcess(Page)} keeps only its leading digits.
     */
    @ExtractBy("//div[@class='aw-mod']/div/h2/text()")
    private String commentNum;

    /** Text of {@code time.time}, passed through {@link DateStringFormatter}. */
    @Formatter(value = "%s", formatter = DateStringFormatter.class)
    @ExtractBy("//time[@class='time']/text()")
    private String publishTime;

    /** The matched {@code div.message-content.editor-style} element. */
    @ExtractBy("//div[@class='message-content editor-style']")
    private String content;

    /**
     * Starts a crawl from the first weekly-hot listing page using 8 threads and
     * prints every extracted {@code BlogInfo} through a
     * {@link ConsolePageModelPipeline}.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        Site site = Site.me().setRetryTimes(10)
                .setSleepTime(500)
                .setTimeOut(30000)
                .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36");
        OOSpider.create(site, new ConsolePageModelPipeline(), BlogInfo.class)
//                .addPageModel(new JsonFilePageModelPipeline("./data/blogs.json"), BlogInfo.class)
                .addUrl("https://blog.hellobi.com/hot/weekly?page=1")
                .thread(8)
                .run();
    }

    /**
     * Publishes cleaned-up {@code readNum} and {@code commentNum} values as
     * fields on the page.
     *
     * <p>{@code readNum} becomes the trimmed text after the first colon of the
     * raw value; {@code commentNum} becomes the run of digits at the start of
     * the raw value (the whole number, not just its first digit). A value that
     * is missing or cannot be parsed is skipped rather than raising an
     * exception, so one unusual page does not fail the extraction.
     *
     * <p>NOTE(review): only the page's result fields are written here; this
     * object's own {@code readNum}/{@code commentNum} fields keep the raw text.
     * Confirm whether downstream pipelines expect the cleaned values on the
     * model as well.
     *
     * @param page the page this model was extracted from
     */
    @Override
    public void afterProcess(Page page) {
        String cleanedReadNum = valueAfterFirstColon(this.readNum);
        if (cleanedReadNum != null) {
            page.putField("readNum", cleanedReadNum);
        }

        String cleanedCommentNum = leadingDigits(this.commentNum);
        if (cleanedCommentNum != null) {
            page.putField("commentNum", cleanedCommentNum);
        }
    }

    /**
     * Returns the trimmed segment following the first {@code ':'} in
     * {@code raw}, or {@code null} when {@code raw} is {@code null} or has no
     * such segment.
     */
    private static String valueAfterFirstColon(String raw) {
        if (raw == null) {
            return null;
        }
        String[] parts = raw.split(":");
        return parts.length > 1 ? parts[1].trim() : null;
    }

    /**
     * Returns the digits at the start of {@code raw} (after trimming), or
     * {@code null} when {@code raw} is {@code null} or does not start with a
     * digit.
     */
    private static String leadingDigits(String raw) {
        if (raw == null) {
            return null;
        }
        String text = raw.trim();
        int end = 0;
        while (end < text.length() && Character.isDigit(text.charAt(end))) {
            end++;
        }
        return end > 0 ? text.substring(0, end) : null;
    }
}
